# Standard data/ML stack plus Graphviz tooling for decision-tree visualisation.
import pandas as pd
import graphviz
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import six
import sys
# Compatibility shim: newer scikit-learn removed sklearn.externals.six, which
# some older visualisation helpers still try to import.  The alias must point
# at the six *module* itself — the original assigned six.StringIO (a class),
# which breaks any `from sklearn.externals.six import ...` that follows.
sys.modules['sklearn.externals.six'] = six
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
# Load the red-wine quality dataset (CSV expected in the working directory).
data = pd.read_csv("winequality_red.csv")
# Notebook-style echo of the raw DataFrame.
data
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1594 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 |
| 1595 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 |
| 1596 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 |
| 1597 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 |
| 1598 | 6.0 | 0.310 | 0.47 | 3.6 | 0.067 | 18.0 | 42.0 | 0.99549 | 3.39 | 0.66 | 11.0 | 6 |
1599 rows × 12 columns
# Summary statistics (count/mean/std/min/quartiles/max) for every column.
data.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# True if any value anywhere in the DataFrame is missing (it is not: False).
data.isnull().any().any()
False
# List the column names (11 physicochemical features + the 'quality' target).
data.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
# Features: all physicochemical measurements; target: the quality score.
X = data.drop(columns = 'quality')
y = data['quality']
# 70/30 train/test split with a fixed seed for reproducibility.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.30, random_state= 355)
# Baseline tree with default hyperparameters (grows until leaves are pure).
DT= DecisionTreeClassifier()
DT.fit(x_train,y_train)
DecisionTreeClassifier()
# Human-readable labels for the tree plot: feature (column) names and the
# distinct quality classes present in the training split.
feature_n = list(X.columns)
class_n = list(y_train.unique())
feature_n
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
# Export the fitted tree to Graphviz DOT format (a dot file storing the tree
# structure), render it to a PNG on disk, and display it inline.
dot_data = export_graphviz(DT,feature_names = feature_n,rounded = True,filled = True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png("myTree.png")
Image(graph.create_png())
# Training-set accuracy.  An unpruned tree memorises the data (score == 1.0),
# a strong sign of overfitting.
DT.score(x_train,y_train)
1.0
# NOTE(review): py_pred is computed but never used below — DT.score derives
# predictions internally anyway.
py_pred = DT.predict(x_test)
#accuracy of our classification tree
DT.score(x_test,y_test)
0.6291666666666667
# Standardise features to zero mean / unit variance before PCA.
# NOTE(review): the scaler is fit on the FULL dataset before splitting, which
# leaks test-set statistics into training — fitting on x_train only would
# avoid this leakage.
scalar = StandardScaler()
x_transform = scalar.fit_transform(X)
x_train,x_test,y_train,y_test = train_test_split(x_transform,y,test_size= 0.30,random_state=355)
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Scree plot: cumulative explained variance vs. number of components, used to
# decide how many principal components to keep.
pca = PCA()
principal_components = pca.fit_transform(x_transform)  # (fixed typo: was princialComponents)
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')
plt.title('Explained Variance')
plt.show()

# Keep the first 8 components (covers most of the variance per the plot above)
# and wrap them in a labelled DataFrame.
pca = PCA(n_components=8)
new_data = pca.fit_transform(x_transform)
principal_x = pd.DataFrame(new_data,columns=['PC-1','PC-2','PC-3','PC-4','PC-5','PC-6','PC-7','PC-8'])
principal_x
| PC-1 | PC-2 | PC-3 | PC-4 | PC-5 | PC-6 | PC-7 | PC-8 | |
|---|---|---|---|---|---|---|---|---|
| 0 | -1.619530 | 0.450950 | -1.774454 | 0.043740 | 0.067014 | -0.913921 | -0.161043 | -0.282258 |
| 1 | -0.799170 | 1.856553 | -0.911690 | 0.548066 | -0.018392 | 0.929714 | -1.009829 | 0.762587 |
| 2 | -0.748479 | 0.882039 | -1.171394 | 0.411021 | -0.043531 | 0.401473 | -0.539553 | 0.597946 |
| 3 | 2.357673 | -0.269976 | 0.243489 | -0.928450 | -1.499149 | -0.131017 | 0.344290 | -0.455375 |
| 4 | -1.619530 | 0.450950 | -1.774454 | 0.043740 | 0.067014 | -0.913921 | -0.161043 | -0.282258 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1594 | -2.150500 | 0.814286 | 0.617063 | 0.407687 | -0.240936 | 0.054835 | 0.170812 | -0.355866 |
| 1595 | -2.214496 | 0.893101 | 1.807402 | 0.414003 | 0.119592 | -0.674711 | -0.607970 | -0.247640 |
| 1596 | -1.456129 | 0.311746 | 1.124239 | 0.491877 | 0.193716 | -0.506410 | -0.231082 | 0.079382 |
| 1597 | -2.270518 | 0.979791 | 0.627965 | 0.639770 | 0.067735 | -0.860408 | -0.321487 | -0.468876 |
| 1598 | -0.426975 | -0.536690 | 1.628955 | -0.391716 | 0.450482 | -0.496154 | 1.189132 | 0.042176 |
1599 rows × 8 columns
# Re-split using the 8 principal components as features, then train a fresh
# default decision tree on the reduced representation and score it.
x_train,x_test,y_train,y_test=train_test_split(principal_x,y,test_size=.30,random_state=355)
DT =DecisionTreeClassifier()
DT.fit(x_train,y_train)
DT.score(x_test,y_test)
0.5958333333333333
# Hyperparameter tuning: exhaustive grid search with 5-fold cross-validation
# over the main decision-tree regularisation knobs, parallelised across all
# cores (n_jobs=-1).
grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(10,32,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'splitter' : ['best', 'random']}
grid_search = GridSearchCV(estimator=DT,
                           param_grid=grid_param,cv=5,n_jobs=-1)
grid_search.fit(x_train,y_train)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': range(10, 32),
'min_samples_leaf': range(1, 10),
'min_samples_split': range(2, 10),
'splitter': ['best', 'random']})
# Best hyperparameter combination found by the grid search.
best_parameters = grid_search.best_params_
print(best_parameters)
{'criterion': 'entropy', 'max_depth': 23, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
# Mean cross-validated accuracy achieved by the best parameter combination.
grid_search.best_score_
0.596945067264574
# Refit the tree with the tuned hyperparameters.  The original hard-coded the
# parameters and used max_depth=28, which contradicts grid_search.best_params_
# (max_depth=23); unpacking best_parameters keeps the refit in sync with the
# search result automatically.
DT = DecisionTreeClassifier(**best_parameters)
DT.fit(x_train,y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=28)
# Held-out test accuracy of the tuned tree.
DT.score(x_test,y_test)
0.53125
# Visualise the tuned tree with human-readable labels.
feature_n = list(X.columns)
class_n = list(y_train.unique())
# Fix: feature_n/class_n were computed but never passed to export_graphviz,
# so the plot showed X[0]..X[10] instead of the real column names (and no
# class labels).  class_names must be strings ordered like DT.classes_,
# i.e. the sorted unique labels.
dot_data = export_graphviz(DT, feature_names=feature_n,
                           class_names=[str(c) for c in sorted(class_n)],
                           rounded=True, filled=True)
#draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# Persist the trained artefacts (model, scaler, PCA) so the prediction
# service can reload the exact same preprocessing + model pipeline.
import pickle

_model_dir = 'C:\\Users\\DEVKAR\\Desktop\\DSPractice\\DecisionTreee'
for _suffix, _artefact in (('\\modelForPrediction.sav', DT),
                           ('\\standardScalar.sav', scalar),
                           ('\\pca_model.sav', pca)):
    with open(_model_dir + _suffix, 'wb') as f:
        pickle.dump(_artefact, f)